In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy import stats
import warnings
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings('ignore')
#from sklearn.model_selection import train_test_split
In [2]:
def pretty_print_linear(coefs, names = None, sort = False):
    """Render a linear model as a human-readable 'coef * name + ...' string.

    coefs : sequence of coefficients.
    names : optional feature names; defaults to X0, X1, ...
    sort  : if True, order terms by descending absolute coefficient.
    """
    if names is None:
        names = ["X%s" % idx for idx in range(len(coefs))]
    pairs = list(zip(coefs, names))
    if sort:
        pairs.sort(key=lambda pair: -np.abs(pair[0]))
    terms = ["%s * %s" % (coef, name) for coef, name in pairs]
    return " + ".join(terms)
In [3]:
def scale_data(X):
    """Standardize features column-wise to zero mean and unit variance.

    Returns a new array; the input is not modified.
    """
    return StandardScaler().fit_transform(X)
In [4]:
def split_data(X, Y):
    """Split features and target into train/test sets (67/33).

    random_state is fixed so the split is reproducible across runs.
    Returns (X_train, X_test, Y_train, Y_test).
    """
    return train_test_split(X, Y, test_size=0.33, random_state=42)
In [5]:
def root_mean_square_error(y_pred, y_test):
    """Root-mean-square error between predictions and targets.

    Parameters
    ----------
    y_pred, y_test : array-like of equal length.

    Returns
    -------
    float : sqrt(mean((y_pred - y_test)^2)).
    """
    # The original wrapped the residuals in abs() before the dot product;
    # squaring already discards the sign, so that was redundant. The
    # vectorized mean form is equivalent and clearer.
    residuals = np.asarray(y_pred) - np.asarray(y_test)
    return np.sqrt(np.mean(residuals ** 2))
In [6]:
def plot_real_vs_predicted(y_pred, y_test):
    """Scatter predictions against ground truth with a reference line.

    The green line is the identity over the 0-50 range; points on it are
    perfect predictions. Returns the pyplot module for further tweaking.
    """
    plt.xlabel('predicted')
    plt.ylabel('real')
    plt.plot(y_pred, y_test, 'ro')
    plt.plot([0, 50], [0, 50], 'g-')  # hardcoded to the 0-50 score range
    plt.show()
    return plt
In [7]:
def generate_regression_values(model, X, y):
    """Print and return an OLS-style coefficient table for a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing `intercept_`, `coef_`, and `predict`.
    X : 2-D feature matrix (array-like or DataFrame).
    y : target vector aligned with `X`.

    Returns
    -------
    pandas.DataFrame with one row per parameter (intercept first):
    coefficient, standard error, t value, and two-sided p-value.
    """
    params = np.append(model.intercept_, model.coef_)
    predictions = model.predict(X)

    # Design matrix with an explicit intercept column.
    newX = pd.DataFrame({"Constant": np.ones(len(X))}).join(pd.DataFrame(X))

    # Residual degrees of freedom: n observations minus k parameters
    # (intercept included).
    dof = len(newX) - len(newX.columns)
    MSE = (sum((y - predictions) ** 2)) / dof

    # Var(beta_hat) = sigma^2 * diag((X'X)^-1); sd is its square root.
    var_b = MSE * (np.linalg.inv(np.dot(newX.T, newX)).diagonal())
    sd_b = np.sqrt(var_b)
    ts_b = params / sd_b

    # BUG FIX: p-values previously used n-1 degrees of freedom; the correct
    # residual dof for OLS is n-k, consistent with the MSE above.
    # stats.t.sf is the numerically stable equivalent of 1 - cdf.
    p_values = [2 * stats.t.sf(np.abs(t), dof) for t in ts_b]

    table = pd.DataFrame()
    table["Coefficients"] = np.round(params, 4)
    table["Standard Errors"] = np.round(sd_b, 3)
    table["t values"] = np.round(ts_b, 3)
    table["Probabilities"] = np.round(p_values, 3)  # typo fixed ("Probabilites")
    print(table)
    return table
In [ ]:
 
In [8]:
# Load the synthetic health dataset. The CSV carries a stale index column
# ('Unnamed: 0') left by a previous to_csv() without index=False, so drop it.
health_score = pd.read_csv('Synthetic_Health_Data.csv').drop(['Unnamed: 0'],axis=1)
health_score.head(5)
Out[8]:
Age Height Weight BMI Fat% sleep_hours Exercise_Time gender Glasses_Of_Water_PerWeek Alcohol_Servings_PerWeek ... 30min_Cardio_PerWeek Intensity_of_Cardio Strength_Training_PerWeek Smoke_PerDay Chew_Tobacco Healthy_WorkLife_Balance Stressed steps_PerMonth sitting_hours health_score
0 37 166 82.080267 28.256774 18.947780 5.296131 0.106583 female 0-5 glass none ... More than 3 times a week Light Light 15+ cigrattes Yes Sometimes Sometimes 87612 2 45.653149
1 26 167 63.702687 22.286450 1.602097 5.619734 1.057065 male 6-9 glass none ... 1-2 times week Very Hard No Never Smoked No No No 73339 10 49.455607
2 32 177 66.277449 21.593621 27.185321 5.521740 0.831857 female 0-5 glass none ... 1-2 times week Light Hard Never Smoked Yes Yes No 106329 11 54.382471
3 44 178 86.076920 27.062654 12.032952 8.714550 0.028077 male 0-5 glass 4+ servings ... Never or Rarely Hard Very Hard Quit smoking No Sometimes Yes 121816 4 55.461273
4 25 153 36.960478 16.427001 15.187378 6.295620 0.103930 female 6-9 glass 4+ servings ... Never or Rarely No No 15+ cigrattes No No Yes 128999 2 38.095238

5 rows × 26 columns

Exploratory Data Analysis

In [14]:
# NOTE(review): imports mid-notebook break fresh-kernel re-runs; these belong
# in the top import cell.
import matplotlib.pyplot as plt, seaborn as sns
import matplotlib.style as style

# Pairwise scatter matrix of all numeric columns, colored by gender.
sns.pairplot(health_score, hue ='gender')
Out[14]:
<seaborn.axisgrid.PairGrid at 0x246f88e5f90>
In [15]:
# NOTE(review): duplicate of the imports in the previous cell — safe but
# redundant; one top-level import would do.
import matplotlib.pyplot as plt, seaborn as sns
import matplotlib.style as style

# Same pairwise view, this time colored by smoking frequency.
sns.pairplot(health_score, hue ='Smoke_PerDay')
Out[15]:
<seaborn.axisgrid.PairGrid at 0x246fff56c50>
In [9]:
# Survey columns holding categorical answers that need integer encoding
# before fitting scikit-learn models.
cat_cols = ['gender',
'Glasses_Of_Water_PerWeek',
'Alcohol_Servings_PerWeek',
'Three_or_More_Servings_of_whole_grain_perDay',
'Eat_Nuts_or_Fish_2_or_more_times_perWeek',
'Butter_or_Cheese_or_Cream_Milk_Or_Curd_2_or_more_times_perWeek',
'Sweet_Treats_Consumption_on_most_days_of_the_week',
'Fried_or_Junk_food_Consumption_on_most_days_of_the_week',
'Five_or_More_fruits_and_vegetables_perDay',
'30min_Cardio_PerWeek',
'Intensity_of_Cardio',
'Strength_Training_PerWeek',
'Smoke_PerDay',
'Chew_Tobacco',
'Healthy_WorkLife_Balance',
'Stressed']

# Replace each categorical column with integer codes. Categories are taken in
# first-appearance order (health_score[col].unique()), so the codes carry NO
# ordinal meaning. NOTE(review): for a linear model, an explicit ordinal
# mapping or one-hot encoding would be more defensible than arbitrary codes.
for col in cat_cols:
    health_score[col] = pd.Categorical(health_score[col], categories=health_score[col].unique()).codes
In [10]:
# Preview the frame after encoding: all survey columns are now integer codes.
health_score.head(5)
Out[10]:
Age Height Weight BMI Fat% sleep_hours Exercise_Time gender Glasses_Of_Water_PerWeek Alcohol_Servings_PerWeek ... 30min_Cardio_PerWeek Intensity_of_Cardio Strength_Training_PerWeek Smoke_PerDay Chew_Tobacco Healthy_WorkLife_Balance Stressed steps_PerMonth sitting_hours health_score
0 37 166 82.080267 28.256774 18.947780 5.296131 0.106583 0 0 0 ... 0 0 0 0 0 0 0 87612 2 45.653149
1 26 167 63.702687 22.286450 1.602097 5.619734 1.057065 1 1 0 ... 1 1 1 1 1 1 1 73339 10 49.455607
2 32 177 66.277449 21.593621 27.185321 5.521740 0.831857 0 0 0 ... 1 0 2 1 0 2 1 106329 11 54.382471
3 44 178 86.076920 27.062654 12.032952 8.714550 0.028077 1 0 1 ... 2 2 3 2 1 0 2 121816 4 55.461273
4 25 153 36.960478 16.427001 15.187378 6.295620 0.103930 0 1 1 ... 2 3 1 0 1 1 2 128999 2 38.095238

5 rows × 26 columns

Linear Regression

In [11]:
# Age, Height, Weight and Fat% are largely redundant with BMI, and gender is
# excluded from the model, so remove all five before fitting.
health_score = health_score.drop(columns=['Age', 'Height', 'Weight', 'Fat%', 'gender'])
health_score.head(5)
Out[11]:
BMI sleep_hours Exercise_Time Glasses_Of_Water_PerWeek Alcohol_Servings_PerWeek Three_or_More_Servings_of_whole_grain_perDay Eat_Nuts_or_Fish_2_or_more_times_perWeek Butter_or_Cheese_or_Cream_Milk_Or_Curd_2_or_more_times_perWeek Sweet_Treats_Consumption_on_most_days_of_the_week Fried_or_Junk_food_Consumption_on_most_days_of_the_week ... 30min_Cardio_PerWeek Intensity_of_Cardio Strength_Training_PerWeek Smoke_PerDay Chew_Tobacco Healthy_WorkLife_Balance Stressed steps_PerMonth sitting_hours health_score
0 28.256774 5.296131 0.106583 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 87612 2 45.653149
1 22.286450 5.619734 1.057065 1 0 0 1 0 0 1 ... 1 1 1 1 1 1 1 73339 10 49.455607
2 21.593621 5.521740 0.831857 0 0 1 2 0 1 1 ... 1 0 2 1 0 2 1 106329 11 54.382471
3 27.062654 8.714550 0.028077 0 1 1 2 1 2 1 ... 2 2 3 2 1 0 2 121816 4 55.461273
4 16.427001 6.295620 0.103930 1 1 2 0 1 1 1 ... 2 3 1 0 1 1 2 128999 2 38.095238

5 rows × 21 columns

In [12]:
# Target is the continuous health score; every remaining column is a feature.
Y=health_score['health_score']
X=health_score.drop(['health_score'],axis=1)
In [18]:
# 75/25 train/test split; a fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=104, test_size=0.25, shuffle=True
)
In [19]:
# Sanity-check the split sizes: 2250/750 rows, 20 feature columns.
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)
(2250, 20)
(750, 20)
(2250,)
(750,)
In [20]:
from sklearn import metrics
In [22]:
# Fit an ordinary least-squares model on the training split.
linreg = LinearRegression()
linreg.fit(X_train, Y_train)

# Show the fitted equation with terms ordered by absolute coefficient.
print ("Linear model: ", pretty_print_linear(linreg.coef_, X.columns, sort = True))

# Hold-out predictions; metrics are computed in the next cell.
Y_lin_predict = linreg.predict(X_test)
Linear model:  -4.848617224854395 * 30min_Cardio_PerWeek + 4.7868889669633665 * Chew_Tobacco + 3.0946881325138045 * Exercise_Time + 2.5569400557061015 * Strength_Training_PerWeek + 2.1586465302020272 * Glasses_Of_Water_PerWeek + 1.406972030131514 * Eat_Nuts_or_Fish_2_or_more_times_perWeek + 1.3276233924417706 * Fried_or_Junk_food_Consumption_on_most_days_of_the_week + -1.2315789840104598 * Alcohol_Servings_PerWeek + 1.2063649658481126 * Sweet_Treats_Consumption_on_most_days_of_the_week + -1.1455053784707674 * Butter_or_Cheese_or_Cream_Milk_Or_Curd_2_or_more_times_perWeek + 1.1415350083619613 * Five_or_More_fruits_and_vegetables_perDay + 1.1323464648438006 * Stressed + -1.0249103074577206 * Intensity_of_Cardio + 1.0101659157820206 * Three_or_More_Servings_of_whole_grain_perDay + 0.925866882468655 * Healthy_WorkLife_Balance + 0.5907507838792576 * Smoke_PerDay + -0.25265598530568806 * BMI + -0.18660498640626255 * sleep_hours + -0.11123987871252386 * sitting_hours + 3.8804615679001735e-05 * steps_PerMonth
In [23]:
# NOTE(review): metrics are imported here and again for the tree models
# below; a single top-level import would avoid the duplication.
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
# Hold-out performance of the linear model (R^2 ~ 0.50 per the output below).
print("R^2 : ", r2_score(Y_test, Y_lin_predict))
print("MAE :", mean_absolute_error(Y_test,Y_lin_predict))
print("RMSE:",np.sqrt(mean_squared_error(Y_test, Y_lin_predict)))
R^2 :  0.5023051414196489
MAE : 5.736300201227288
RMSE: 7.153884198323651
In [24]:
# Predicted vs. actual for the linear model; points on the dashed red
# identity line are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(Y_lin_predict, Y_test, edgecolors=(0, 0, 1))
ax.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'r--', lw=3)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

Decision Trees

In [40]:
# Fit a deep regression tree (max_depth=10). With only ~2250 training rows
# this is prone to overfitting — compare its hold-out metrics below.
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=44,max_depth = 10)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
In [41]:
# Visualize the fitted tree; at depth 10 this is very large, so it is mainly
# useful for a rough structural inspection.
from sklearn.tree import plot_tree
plt.figure(figsize=(10,8), dpi=150)
plot_tree(model, feature_names=X.columns);
In [42]:
# NOTE(review): duplicate import (already imported for the linear model).
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
# The depth-10 tree underperforms the linear model on every metric
# (R^2 ~0.21 vs ~0.50), consistent with overfitting.
print("R^2 : ", r2_score(Y_test, predictions))
print("MAE :", mean_absolute_error(Y_test,predictions))
print("RMSE:",np.sqrt(mean_squared_error(Y_test, predictions)))
R^2 :  0.2088128598057699
MAE : 7.272796033185009
RMSE: 9.019862944281558
In [43]:
# Predicted vs. actual for the decision tree; the wider scatter around the
# identity line reflects the weaker fit versus the linear model.
fig, ax = plt.subplots()
ax.scatter(predictions, Y_test, edgecolors=(0, 0, 1))
ax.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'r--', lw=3)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()
In [44]:
# Refit a shallow tree (max_depth=3) for interpretability.
# NOTE(review): this rebinds `model` and `predictions` from the depth-10 run;
# distinct names (e.g. model_d3) would avoid hidden-state confusion if cells
# are re-run out of order.
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=44,max_depth = 3)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
In [45]:
# At depth 3 the tree is small enough to read the actual split rules.
from sklearn.tree import plot_tree
plt.figure(figsize=(10,8), dpi=150)
plot_tree(model, feature_names=X.columns);
In [ ]: